#libraries

#data:Rodents habitat

db<-read.csv("C:\\Users\\Montero\\Desktop\\Escritorio 2024\\Betametrica\\MODULO IV\\Bolger.csv")

attach(db)

#Modelo logit vs probit

logit=glm(RODENTSP~PERSHRUB+DISTX+AGE, data=db,family=binomial(link = "logit"))
summary(logit)
## 
## Call:
## glm(formula = RODENTSP ~ PERSHRUB + DISTX + AGE, family = binomial(link = "logit"), 
##     data = db)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)  
## (Intercept) -5.9099159  3.1125426  -1.899   0.0576 .
## PERSHRUB     0.0958695  0.0406119   2.361   0.0182 *
## DISTX        0.0003087  0.0007741   0.399   0.6900  
## AGE          0.0250077  0.0376618   0.664   0.5067  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 34.617  on 24  degrees of freedom
## Residual deviance: 19.358  on 21  degrees of freedom
## AIC: 27.358
## 
## Number of Fisher Scoring iterations: 5
probit<-glm(RODENTSP~PERSHRUB+DISTX+AGE, data=db,
             family=binomial(link="probit"))

summary (probit)
## 
## Call:
## glm(formula = RODENTSP ~ PERSHRUB + DISTX + AGE, family = binomial(link = "probit"), 
##     data = db)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)   
## (Intercept) -3.7370090  1.7574065  -2.126  0.03347 * 
## PERSHRUB     0.0592018  0.0220816   2.681  0.00734 **
## DISTX        0.0002038  0.0004366   0.467  0.64059   
## AGE          0.0184713  0.0209535   0.882  0.37803   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 34.617  on 24  degrees of freedom
## Residual deviance: 19.131  on 21  degrees of freedom
## AIC: 27.131
## 
## Number of Fisher Scoring iterations: 6
memisc::mtable(logit,probit, digits = 6, sdigits = 5,summary.stats = T)
## 
## Calls:
## logit: glm(formula = RODENTSP ~ PERSHRUB + DISTX + AGE, family = binomial(link = "logit"), 
##     data = db)
## probit: glm(formula = RODENTSP ~ PERSHRUB + DISTX + AGE, family = binomial(link = "probit"), 
##     data = db)
## 
## ===========================================
##                     logit       probit     
## -------------------------------------------
##   (Intercept)     -5.909916   -3.737009*   
##                   (3.112543)  (1.757406)   
##   PERSHRUB         0.095869*   0.059202**  
##                   (0.040612)  (0.022082)   
##   DISTX            0.000309    0.000204    
##                   (0.000774)  (0.000437)   
##   AGE              0.025008    0.018471    
##                   (0.037662)  (0.020953)   
## -------------------------------------------
##   Log-likelihood  -9.67879    -9.56556     
##   N               25          25           
## ===========================================
##   Significance: *** = p < 0.001;   
##                 ** = p < 0.01;   
##                 * = p < 0.05
#el antilog del los coefientes (b)de la tabla son los odd-ratios 

Bondad de Ajuste de los modelos Hosmer and Lemeshow goodness of fit (GOF) test

hl1<-hoslem.test(db$RODENTSP,fitted(logit),g=2)

hl2<-hoslem.test(db$RODENTSP,fitted(probit),g=2)

hl1
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  db$RODENTSP, fitted(logit)
## X-squared = 0.64712, df = 0, p-value < 2.2e-16
hl2
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  db$RODENTSP, fitted(probit)
## X-squared = 0.73995, df = 0, p-value < 2.2e-16
#Se rechaza hypothesis de H0 : hay bondad de ajuste

Evaluando por medio de la matris de confución

# Valor umbral (sin criterio técnico) el promedio de los 
#valores predichos

threshold_logit<-  mean(fitted(logit)) #definión del umbral sin criterio técnico 

threshold_probit<-  mean(fitted(probit))

threshold_logit
## [1] 0.48
threshold_probit
## [1] 0.4857771
#tabla de clasificación use library(QuantPsyc)
#para esta tabla de clasificación el valor UMBRAL ES DETERMINANTE EN LOS RESULTADOS!!!
# SI SE CAMBIA LOS RESULTADOS CAMBIAN 

CrossTable(db$RODENTSP,fitted(logit)>threshold_logit, expected=FALSE) #se puede hacer con solo esta linea
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  25 
## 
##  
##              | fitted(logit) > threshold_logit 
##  db$RODENTSP |     FALSE |      TRUE | Row Total | 
## -------------|-----------|-----------|-----------|
##            0 |        11 |         2 |        13 | 
##              |     1.901 |     2.419 |           | 
##              |     0.846 |     0.154 |     0.520 | 
##              |     0.786 |     0.182 |           | 
##              |     0.440 |     0.080 |           | 
## -------------|-----------|-----------|-----------|
##            1 |         3 |         9 |        12 | 
##              |     2.059 |     2.621 |           | 
##              |     0.250 |     0.750 |     0.480 | 
##              |     0.214 |     0.818 |           | 
##              |     0.120 |     0.360 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |        14 |        11 |        25 | 
##              |     0.560 |     0.440 |           | 
## -------------|-----------|-----------|-----------|
## 
## 
ClassLog(logit,db$RODENTSP, cut=threshold_logit)
## $rawtab
##        resp
##          0  1
##   FALSE 11  3
##   TRUE   2  9
## 
## $classtab
##        resp
##                 0         1
##   FALSE 0.8461538 0.2500000
##   TRUE  0.1538462 0.7500000
## 
## $overall
## [1] 0.8
## 
## $mcFadden
## [1] 0.4408127
CrossTable(db$RODENTSP,fitted(probit)>threshold_probit, expected=FALSE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  25 
## 
##  
##              | fitted(probit) > threshold_probit 
##  db$RODENTSP |     FALSE |      TRUE | Row Total | 
## -------------|-----------|-----------|-----------|
##            0 |        11 |         2 |        13 | 
##              |     1.901 |     2.419 |           | 
##              |     0.846 |     0.154 |     0.520 | 
##              |     0.786 |     0.182 |           | 
##              |     0.440 |     0.080 |           | 
## -------------|-----------|-----------|-----------|
##            1 |         3 |         9 |        12 | 
##              |     2.059 |     2.621 |           | 
##              |     0.250 |     0.750 |     0.480 | 
##              |     0.214 |     0.818 |           | 
##              |     0.120 |     0.360 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |        14 |        11 |        25 | 
##              |     0.560 |     0.440 |           | 
## -------------|-----------|-----------|-----------|
## 
## 
ClassLog(probit,db$RODENTSP, cut=threshold_probit)
## $rawtab
##        resp
##          0  1
##   FALSE 11  3
##   TRUE   2  9
## 
## $classtab
##        resp
##                 0         1
##   FALSE 0.8461538 0.2500000
##   TRUE  0.1538462 0.7500000
## 
## $overall
## [1] 0.8
## 
## $mcFadden
## [1] 0.4473546
#Ahora evaluamos la capacidad predistiva del modelo usando otros criterios
# por ejemplo: la curva ROC
pred_logit<-prediction(logit$fitted.values, db$RODENTSP)

pred_probit<-prediction(probit$fitted.values, db$RODENTSP)

perf_logit<-performance(pred_logit,measure="tpr",#tpr=true positive rate
                  x.measure ="fpr")              # fpr= fall positive rate

perf_probit<-performance(pred_probit,measure="tpr",#tpr=true positive rate
                        x.measure ="fpr")       



plot(perf_logit, colorize=T,lty=3)
abline(0,1,col="black")

plot(perf_probit, colorize=T,lty=3)
abline(0,1,col="black")

#Recuperando el valor del àrea bajo de la curva: mas área mejor modelo

aucp_logit<-performance(pred_logit,measure="auc") #Area under the ROC curve

aucp_logit<-aucp_logit@y.values[[1]]
aucp_logit
## [1] 0.8910256
aucp_probit<-performance(pred_logit,measure="auc") #Area under the ROC curve

aucp_probit<-aucp_probit@y.values[[1]]
aucp_probit
## [1] 0.8910256

#Punto de corte óptimo

#Usado library Epi SOLO PARA LOGIT

ROC(form = RODENTSP~PERSHRUB+DISTX+AGE, plot="ROC")

#Punto de corte optimo SOLO PARA LOGIT

#es donde  la sensitividad y la especificidad de intersectan

ROC(form = RODENTSP~PERSHRUB+DISTX+AGE, plot="sp")#sp=especificidad

#=====para el PROBIT======

(perf1<-performance(pred_probit,"sens","spec")) 
## A performance instance
##   'Specificity' vs. 'Sensitivity' (alpha: 'Cutoff')
##   with 26 data points
sen<-slot(perf1,"y.values")[[1]]
esp<-slot(perf1,"x.values")[[1]]
alfa<-slot(perf1,"alpha.values")[[1]]

mat<-data.frame(sen,esp,alfa)
mat
##           sen        esp       alfa
## 1  0.00000000 1.00000000        Inf
## 2  0.08333333 1.00000000 0.98409915
## 3  0.16666667 1.00000000 0.97397108
## 4  0.25000000 1.00000000 0.97170427
## 5  0.33333333 1.00000000 0.96005790
## 6  0.41666667 1.00000000 0.95281624
## 7  0.50000000 1.00000000 0.94905729
## 8  0.58333333 1.00000000 0.93598925
## 9  0.66666667 1.00000000 0.91631191
## 10 0.66666667 0.92307692 0.72112136
## 11 0.75000000 0.92307692 0.60957106
## 12 0.75000000 0.84615385 0.52020615
## 13 0.75000000 0.76923077 0.38845386
## 14 0.75000000 0.69230769 0.37995355
## 15 0.83333333 0.69230769 0.32195291
## 16 0.91666667 0.69230769 0.31077470
## 17 0.91666667 0.61538462 0.31025353
## 18 0.91666667 0.53846154 0.23824176
## 19 0.91666667 0.46153846 0.19660582
## 20 1.00000000 0.46153846 0.11579631
## 21 1.00000000 0.38461538 0.10875966
## 22 1.00000000 0.30769231 0.07960922
## 23 1.00000000 0.23076923 0.07785613
## 24 1.00000000 0.15384615 0.04795703
## 25 1.00000000 0.07692308 0.03983344
## 26 1.00000000 0.00000000 0.03347414
#=====para el LOGIT======
(perf2<-performance(pred_logit,"sens","spec")) 
## A performance instance
##   'Specificity' vs. 'Sensitivity' (alpha: 'Cutoff')
##   with 26 data points
sen2<-slot(perf2,"y.values")[[1]]
esp2<-slot(perf2,"x.values")[[1]]
alfa2<-slot(perf2,"alpha.values")[[1]]

mat2<-data.frame(sen2,esp2,alfa2)
mat2
##          sen2       esp2      alfa2
## 1  0.00000000 1.00000000        Inf
## 2  0.08333333 1.00000000 0.97070655
## 3  0.16666667 1.00000000 0.96358890
## 4  0.25000000 1.00000000 0.95942634
## 5  0.33333333 1.00000000 0.94965637
## 6  0.41666667 1.00000000 0.93711358
## 7  0.50000000 1.00000000 0.93388591
## 8  0.58333333 1.00000000 0.91020726
## 9  0.66666667 1.00000000 0.90074886
## 10 0.66666667 0.92307692 0.71404230
## 11 0.75000000 0.92307692 0.62155265
## 12 0.75000000 0.84615385 0.51434036
## 13 0.75000000 0.76923077 0.39453758
## 14 0.75000000 0.69230769 0.38440837
## 15 0.83333333 0.69230769 0.34126211
## 16 0.91666667 0.69230769 0.32541846
## 17 0.91666667 0.61538462 0.29057568
## 18 0.91666667 0.53846154 0.22286175
## 19 0.91666667 0.46153846 0.16397005
## 20 0.91666667 0.38461538 0.10286861
## 21 1.00000000 0.38461538 0.09491477
## 22 1.00000000 0.30769231 0.09285278
## 23 1.00000000 0.23076923 0.07525586
## 24 1.00000000 0.15384615 0.04855852
## 25 1.00000000 0.07692308 0.04844846
## 26 1.00000000 0.00000000 0.03879793

#Ahora para graficar las dos curvas

library(reshape2)
library(gridExtra)
## 
## Adjuntando el paquete: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
m<-melt(mat,id=c("alfa"))
m2<-melt(mat2,id=c("alfa2"))

p1<-ggplot(m,aes(alfa,value,group=variable,
                 colour=variable))+
                 geom_line(size=1.2)+
  labs(title="punto de corte para logit")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
p1

p2<-ggplot(m2,aes(alfa2,value,group=variable,
                 colour=variable))+
  geom_line(size=1.2)+
  labs(title="punto de corte para probit")
p2

library(plotly)
## Warning: package 'plotly' was built under R version 4.4.3
## 
## Adjuntando el paquete: 'plotly'
## The following objects are masked from 'package:memisc':
## 
##     rename, style
## The following object is masked from 'package:MASS':
## 
##     select
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
ggplotly(p1)
ggplotly(p2)
g1<-grid.arrange(p1,p2,ncol=2)

g1
## TableGrob (1 x 2) "arrange": 2 grobs
##   z     cells    name           grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]

Proyectando la probabilidad

names(db)
## [1] "PERSHRUB" "DISTX"    "AGE"      "RODENTSP"
newdata<-data.frame(PERSHRUB=50, DISTX=500, AGE=20, RODENTSP=1)
newdata
##   PERSHRUB DISTX AGE RODENTSP
## 1       50   500  20        1
predict(logit,newdata,type="response")
##         1 
## 0.3865283
predict(probit,newdata,type="response")
##         1 
## 0.3799659